{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#top 5k dice keywords\n",
    "# set to none to use phrases only\n",
    "KEY_WORDS_FILE = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt\"\n",
    "# set to none to use keywords only\n",
    "PHRASES_FILE = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt\"\n",
    "VECTOR_SYNONYMS_FILE  = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/vector_synonyms.txt\"\n",
    "MODEL_FILE     = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "#Shared\n",
    "#just used to load phrases file\n",
    "def load_stop_words(stop_words_file):\n",
    "    stop_words = set()\n",
    "    with open(stop_words_file) as f:\n",
    "            for line in f:\n",
    "                word = line.strip()\n",
    "                if word[0] != \"#\":\n",
    "                    word = word.lower()\n",
    "                    stop_words.add(word)\n",
    "    return stop_words\n",
    "\n",
    "def get_vector(item, model):\n",
    "    vocab = model.vocab[item]\n",
    "    vector = model.syn0[vocab.index]\n",
    "    return vector\n",
    "\n",
    "def get_norm_vector(item, model):\n",
    "    if item not in model.vocab:\n",
    "        return None\n",
    "    # for deserialized models, the norm vectors are not stored\n",
    "    vec = get_vector(item, model)\n",
    "    norm = np.linalg.norm(vec)\n",
    "    if norm != 0:\n",
    "        return vec / norm\n",
    "    return vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#functions\n",
    "def is_valid_search_keyword(kw):\n",
    "    q_kw = \" \" + kw + \" \"\n",
    "    for wd in \"(,), and , or , not , true , TRUE , false , FALSE \".split(\",\"):\n",
    "        if wd in q_kw:\n",
    "            return False\n",
    "    # remove queries with negations in them\n",
    "    tokens = kw.split(\" \")\n",
    "    \n",
    "    # remove single char keywords\n",
    "    if len(tokens) == 1 and len(tokens[0]) == 1:\n",
    "        if tokens[0].isalpha():\n",
    "            return True\n",
    "        return False\n",
    "    \n",
    "    if any(map(lambda t: t.strip().startswith(\"-\"), tokens)):\n",
    "        return False\n",
    "    return True\n",
    "\n",
    "def map_keyword(kw):\n",
    "    return kw.replace(\" \", \"_\")\n",
    "\n",
    "def vectors_to_file(fname, terms, model):\n",
    "    with open(fname, \"w+\") as f:\n",
    "        for term in terms:\n",
    "            vec = get_norm_vector(term, model)\n",
    "            if vec is not None: #in model\n",
    "                f.write(\"%s=>\" % term)\n",
    "                for i, val in enumerate(vec):\n",
    "                    # left pad the string so the same number of characters\n",
    "                    f.write(\"%s|%f \" %(str(i).rjust(3, \"0\"),val))\n",
    "                f.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gensim, time\n",
    "from gensim.models.word2vec import Word2Vec\n",
    "\n",
    "# Load the saved Word2Vec model from MODEL_FILE.\n",
    "model = Word2Vec.load(MODEL_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24785"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# With PHRASES_FILE set to None no phrases are used.\n",
    "phrases = set() if PHRASES_FILE is None else load_stop_words(PHRASES_FILE)\n",
    "len(phrases)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4047 keywords loaded from /Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt\n"
     ]
    }
   ],
   "source": [
    "# Keep every non-empty line of the keywords file that passes the validity filter.\n",
    "un_keywords = set()\n",
    "if KEY_WORDS_FILE is not None:\n",
    "    with open(KEY_WORDS_FILE) as f:\n",
    "        candidates = (line.strip() for line in f)\n",
    "        un_keywords.update(kw for kw in candidates if len(kw) > 0 and is_valid_search_keyword(kw))\n",
    "    print(\"%i keywords loaded from %s\" % (len(un_keywords), KEY_WORDS_FILE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(27125, 'total terms')\n"
     ]
    }
   ],
   "source": [
    "# Every term to export: the keywords together with the phrases.\n",
    "all_terms = un_keywords.union(phrases)\n",
    "# Format the message before printing: under the Python 2 kernel,\n",
    "# print(a, b) prints the tuple (a, b) rather than the two values.\n",
    "print(\"%i total terms\" % len(all_terms))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Write the normalised vector of every term found in the model to VECTOR_SYNONYMS_FILE.\n",
    "vectors_to_file(VECTOR_SYNONYMS_FILE, all_terms, model)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
